孔祥喆(11811005) 林子昭(11812534)
All content is completed by us working together.
In this task, I mainly analyze the documents PatientInfo.csv, Time.csv, SearchTrend.csv, TimeAge.csv, TimeGender.csv and TimeProvince.csv. I compute many statistics and visualizations to make the data in each document clear. As a consequence, we can find interesting patterns in the resulting pictures.
Analyze the document PatientInfo.csv
import pandas as pd
import numpy as np

# Load the patient-level dataset and take a first look at its structure.
dataset = pd.read_csv(r"C:\Users\hp\Desktop\PatientInfo.csv")
dataset.head()                   # preview the first rows
dataset.info()                   # column dtypes and non-null counts
dataset.describe()               # preliminary numeric summary
dataset.isnull()                 # element-wise null mask
dataset.isnull().any(axis=0)     # does each column contain a missing value?
After a preliminary analysis of the data, I choose three characteristics of the dataset and use appropriate methods to fill in their missing values.
# Forward-fill: replace each NA with the nearest preceding non-null value.
# NOTE: the original called fillna(method='pad') without assigning the
# result, so the dataset was never actually modified; assigning back makes
# the fill stick. .ffill() is the non-deprecated spelling of method='pad'.
dataset['global_num'] = dataset['global_num'].ffill()
dataset['sex'] = dataset['sex'].ffill()
dataset['contact_number'] = dataset['contact_number'].ffill()
from matplotlib import pyplot as plt
# Tabulate how many patients fall into each category.
dataset['sex'].value_counts() # patients per gender
dataset['infection_case'].value_counts() # patients per infection route
Visualize the number of different genders
# Bar chart: number of patients by sex.
plt.figure(figsize=(3, 8), dpi=80)
sexes = ['male', 'female']
counts = [1327, 1707]
plt.bar(sexes, counts, width=0.5, color=['deepskyblue', 'mediumslateblue'])
# Annotate each bar with its exact count.
for label, value in zip(sexes, counts):
    plt.text(label, value + 0.05, '%.0f' % value, ha='center', va='bottom', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel('sex', fontsize=20)
plt.ylabel('number', fontsize=20)
plt.title('Different Sex Statistics', fontsize=20)
plt.show()
import matplotlib.pyplot as plt
import matplotlib

# Pie chart: percentage of patients by sex.
labels = ["female", "male"]
sizes = [1707, 1327]
wedge_colors = ["mediumorchid", "deepskyblue"]
offsets = [0.04, 0.04]  # pull both wedges slightly out of the pie
patches, label_texts, pct_texts = plt.pie(
    sizes, explode=offsets, colors=wedge_colors, labels=labels,
    labeldistance=1.1, autopct="%1.1f%%", shadow=True,
    startangle=90, pctdistance=0.6)
plt.axis("equal")  # equal axis scaling keeps the pie circular
plt.title("Percentage", fontsize=25)
plt.legend()
# Enlarge both the wedge labels and the percentage labels.
for text in label_texts + pct_texts:
    text.set_size(20)
plt.show()
By the pictures above, we can see that the number of female patients is slightly higher than the number of male patients.
Visualize the number of patients in different age groups
import seaborn as sns

# Bar chart: number of patients per age decade.
dataset['age'].value_counts()
plt.figure(figsize=(10, 8), dpi=80)
ages = ['0s', '10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s', '100s']
counts = [43, 126, 722, 389, 432, 562, 366, 189, 149, 44, 1]
# seaborn >= 0.12 no longer accepts the data vectors positionally, so pass
# them as explicit x=/y= keyword arguments (the original used sns.barplot(x, y)).
sns.barplot(x=ages, y=counts, palette='BuPu_r')
plt.xticks(fontsize=15)
plt.xlabel('Age', fontsize=20)
plt.ylabel('number', fontsize=20)
plt.yticks(fontsize=15)
plt.title('Different Age Statistics', fontsize=20)
plt.show()
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import cm

# Pie chart: percentage of patients per age decade.
plt.figure(figsize=(7, 12))
label_list = ["0s", "10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s", "90s", "100s"]
size = [43, 126, 722, 389, 432, 562, 366, 189, 149, 44, 1]
explode = [0.01] * len(size)  # pull every wedge slightly out of the pie
# Sample evenly spaced rainbow colors, one per age group. (The original also
# built an unused 12-entry named-color list for these 11 wedges; removed.)
colorss = cm.rainbow(np.arange(len(size)) / len(size))
# NOTE(review): radius=50 is unusually large; axis("equal") rescales the
# view so it still renders, but radius=1 is conventional — confirm intended.
patches, l_text, p_text = plt.pie(
    size, explode=explode, colors=colorss, labels=label_list,
    labeldistance=1.05, autopct="%1.1f%%", shadow=True,
    startangle=90, radius=50, pctdistance=0.6)
plt.axis("equal")  # equal axis scaling keeps the pie circular
plt.title("Percentage", fontsize=20)
plt.legend()
for t in l_text:
    t.set_size(10)
for t in p_text:
    t.set_size(10)
plt.show()
By the pictures above, we can see a rough distribution across age groups. Among them, the 20s and 50s age groups form the two largest shares of patients.
Visualize male and female patients of different ages
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from pandas import Series,DataFrame
# Apply seaborn's default theme before the grouped charts below.
sns.set()
# Re-read the patient file for the sex-by-age breakdown.
# NOTE(review): dfd appears unused by the cells below, which re-read the
# file with csv.DictReader — confirm whether this load is still needed.
dfd=pd.read_csv(r"C:\Users\hp\Desktop\PatientInfo.csv")
Count male patients by age group
import csv

# Count male patients per age decade in a single pass over the CSV.
# Fixes the original's dead check for age '66s' (a value that never occurs,
# so its counter always printed 0) and collapses 12 copy-pasted counters
# and if-branches into one dictionary lookup.
male_age_groups = ['0s', '10s', '20s', '30s', '40s', '50s',
                   '60s', '70s', '80s', '90s', '100s']
male_counts = {age: 0 for age in male_age_groups}
with open(r"C:\Users\hp\Desktop\PatientInfo.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        if row["sex"] == 'male' and row["age"] in male_counts:
            male_counts[row["age"]] += 1
# Print one count per age group, youngest first.
for age in male_age_groups:
    print(male_counts[age])
Count female patients by age group
import csv

# Count female patients per age decade in a single pass over the CSV.
# Collapses the original's 12 copy-pasted counters (one of which, cout8,
# was initialised but never used) into one dictionary lookup.
female_age_groups = ['0s', '10s', '20s', '30s', '40s', '50s',
                     '60s', '70s', '80s', '90s', '100s']
female_counts = {age: 0 for age in female_age_groups}
with open(r"C:\Users\hp\Desktop\PatientInfo.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        if row["sex"] == 'female' and row["age"] in female_counts:
            female_counts[row["age"]] += 1
# Print one count per age group, youngest first.
for age in female_age_groups:
    print(female_counts[age])
# Grouped bar chart: patient counts per age decade, split by sex.
plt.figure(figsize=(10, 8))
hue_sex = ['male', 'female'] * 11  # alternating male/female per age group
age_groups = ['0s', '0s', '10s', '10s', '20s', '20s', '30s', '30s',
              '40s', '40s', '50s', '50s', '60s', '60s', '70s', '70s',
              '80s', '80s', '90s', '90s', '100s', '100s']
counts = [27, 16, 71, 55, 342, 379, 197, 192, 168, 264, 219, 343,
          165, 201, 72, 117, 47, 102, 14, 30, 0, 1]
sns.barplot(x=age_groups, y=counts, hue=hue_sex)
plt.xlabel('age')
plt.ylabel('patient_number')
By the picture above, there are more female patients than male patients in most age groups, which is why there are more female patients than male patients overall.
Count the number of patients in different cities
pd.set_option('display.max_columns', None)
from IPython.display import display
# Use the canonical, fully-qualified option names: the original passed the
# abbreviations 'max_columns' and 'max_row'; 'max_row' is not a real pandas
# option key ('display.max_rows' is) and relies on pandas' fuzzy matching.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 300)
# Show the full per-city patient tally without truncation.
display(dataset['city'].value_counts())
I take only the top 5 cities to visualize.
# Bar chart: the five cities with the most patients.
plt.figure(figsize=(10, 8))
# NOTE(review): 'Seonganam-si' looks like a typo for 'Seongnam-si' —
# confirm against the source data before changing the label.
top_cities = ['Gyeongsan-si', 'Seonganam-si', 'Cheonan-si', 'Bonghwa-gun', 'Bucheon-si']
city_counts = [628, 118, 106, 71, 70]
sns.barplot(x=top_cities, y=city_counts)
plt.xlabel('City')
plt.ylabel('Patient_number')
Count the number of patients in different provinces
# Tabulate and chart patient counts per province.
dataset['province'].value_counts()
plt.figure(figsize=(26, 15))
provinces = ['Gyeongsangbuk-do', 'Gyeonggi-do', 'Seoul', 'Chungcheongnam-do',
             'Busan', 'Gyeongsangnam-do', 'Incheon', 'Daegu', 'Sejong',
             'Chungcheongbuk-do', 'Ulsan', 'Daejeon', 'Gangwon-do', 'Gwangju',
             'Jeollabuk-do', 'Jeollanam-do', 'Jeju-do']
province_counts = [1182, 601, 560, 138, 122, 110, 80, 63, 46, 44, 40, 39, 37, 27, 15, 15, 9]
sns.barplot(x=provinces, y=province_counts)
plt.xlabel('Province')
plt.ylabel('Patient_number')
By the picture above, there are three provinces whose patient counts are much higher than those of the other provinces.
Count the patients by infection_case and state
# Tabulate how many patients fall into each infection route and state.
dataset['infection_case'].value_counts()
dataset['state'].value_counts()
# NOTE(review): duplicate of the first call — probably a re-run notebook
# cell; consider removing.
dataset['infection_case'].value_counts()
Visualize the infection_case
# Bar chart: patient counts for the main infection routes.
plt.figure(figsize=(10, 8))
# NOTE(review): 'Shinchenonji' looks like a typo for 'Shincheonji' —
# confirm against the source data before changing the label.
infection_cases = ['contact', 'overseas', 'Guro-gu Call Center', 'Shinchenonji Church', 'etc']
case_counts = [862, 475, 112, 105, 755]
sns.barplot(x=infection_cases, y=case_counts)
plt.xlabel('Infection_case')
plt.ylabel('Patient_number')
By the picture above, contact is still the worst way the virus spreads. Besides, overseas inflow is also a major transmission route of the virus. In the Guro-gu Call Center and the Shincheonji Church there were clusters of infections. These factors must be taken seriously.
Visualize the state
# Bar chart: patient counts by current state.
plt.figure(figsize=(3, 8))
states = ['isolated', 'released', 'deceased']
state_counts = [1934, 1133, 61]
sns.barplot(x=states, y=state_counts)
plt.xlabel('State')
plt.ylabel('Patient_number')
By the picture above, most patients are still isolated and quite a few patients have been released. Only a few patients are deceased.
Count the number of patients in different state and sex
import csv

# Count patients for every (sex, state) combination in one pass over the
# file. The original used six separate counters (plus six more that were
# initialised and never used); a dict keyed by the pair is clearer.
sex_state_pairs = [('female', 'isolated'), ('female', 'released'), ('female', 'deceased'),
                   ('male', 'isolated'), ('male', 'released'), ('male', 'deceased')]
sex_state_counts = {pair: 0 for pair in sex_state_pairs}
with open(r"C:\Users\hp\Desktop\PatientInfo.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        key = (row["sex"], row["state"])
        if key in sex_state_counts:
            sex_state_counts[key] += 1
# Print in the original order: female then male, isolated/released/deceased.
for pair in sex_state_pairs:
    print(sex_state_counts[pair])
Visualize the number of patients by sex and state
# Grouped bar chart: patient counts per state, split by sex.
plt.figure(figsize=(8, 10))
hue_sex = ['male', 'female'] * 3  # alternating male/female per state
states = ['isolated', 'isolated', 'released', 'released', 'deceased', 'deceased']
counts = [814, 1028, 472, 659, 41, 20]
sns.barplot(x=states, y=counts, hue=hue_sex)
plt.xlabel('State')
plt.ylabel('Patient_number')
As expected, female patients outnumber male patients among the isolated and the released. Only among the deceased are male patients slightly more numerous than female patients.
Deal with and visualize Time.csv
Count the number of different state
import csv

# Read the daily 'test', 'confirmed' and 'negative' counts from Time.csv.
# The original opened and scanned the file once per column; a single pass
# collects all three at once.
test, confirmed, negative = [], [], []
with open(r"C:\Users\hp\Desktop\Time.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        test.append(int(row['test']))
        confirmed.append(int(row['confirmed']))
        negative.append(int(row['negative']))
# Print in the original order: test, confirmed, negative.
print(test)
print(confirmed)
print(negative)
Visualize confirmed patients over time
import matplotlib as mat
import datetime

# Line chart: cumulative confirmed cases per day, 2020-01-19 .. 2020-04-06.
# (The original also defined 'test' and 'negative' literals here that this
# cell never plots; the unused data was removed.)
f=plt.figure(figsize=(15,8))
# Daily cumulative confirmed counts hard-coded from Time.csv.
confirmed=[1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 6, 11, 12, 15, 15, 16, 18, 23, 24, 24, 27, 27, 28, 28, 28, 28, 28, 29, 30, 31, 51, 104, 204, 433, 602, 833, 977, 1261, 1766, 2337, 3150, 3736, 4212, 4812, 5328, 5766, 6284, 6767, 7134, 7382, 7513, 7755, 7869, 7979, 8086, 8126, 8236, 8320, 8413, 8565, 8652, 8799, 8897, 8961, 9037, 9137, 9241, 9332, 9478, 9583, 9661, 9786, 9887, 9976, 10062, 10156, 10237, 10284, 10331]
# One matplotlib date per day; drange() excludes the end date, so the
# range is 2020-01-19 through 2020-04-06 inclusive.
std=datetime.datetime(2020,1,19)
edd=datetime.datetime(2020,4,7)
jida=datetime.timedelta(days=1)
dates=mat.dates.drange(std,edd,jida)
axis=plt.gca()
dateFormat=mat.dates.DateFormatter('%Y-%m-%d')
axis.xaxis.set_major_formatter(dateFormat)
axis.plot(dates,confirmed)
plt.xlabel('Date')
plt.ylabel('Patient_number')
legend=plt.legend(["confirmed"])
import pandas as pd
import numpy as np
# Reload the daily time series.
# NOTE(review): 'datase' (probably meant 'dataset') appears unused by the
# plotting cell below, which hard-codes the data — confirm.
datase = pd.read_csv(r"C:\Users\hp\Desktop\Time.csv")
Visualize the three different states over time
import matplotlib as mat
import datetime
# Line chart: daily cumulative test / negative / confirmed counts,
# 2020-01-19 .. 2020-04-06, hard-coded from Time.csv.
f=plt.figure(figsize=(15,8))
confirmed=[1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 6, 11, 12, 15, 15, 16, 18, 23, 24, 24, 27, 27, 28, 28, 28, 28, 28, 29, 30, 31, 51, 104, 204, 433, 602, 833, 977, 1261, 1766, 2337, 3150, 3736, 4212, 4812, 5328, 5766, 6284, 6767, 7134, 7382, 7513, 7755, 7869, 7979, 8086, 8126, 8236, 8320, 8413, 8565, 8652, 8799, 8897, 8961, 9037, 9137, 9241, 9332, 9478, 9583, 9661, 9786, 9887, 9976, 10062, 10156, 10237, 10284, 10331]
test=[1, 1, 4, 22, 27, 27, 51, 61, 116, 187, 246, 312, 371, 429, 490, 607, 714, 885, 1352, 2097, 2598, 3110, 4325, 5624, 6511, 7242, 7734, 8161, 8718, 9772, 11173, 13202, 16400, 21586, 26179, 32756, 40304, 53553, 66652, 81167, 94055, 98921, 109591, 125851, 136707, 146541, 164740, 178189, 188518, 196618, 210144, 222395, 234998, 248647, 261335, 268212, 274504, 286716, 295647, 307024, 316664, 327509, 331780, 338036, 348582, 357896, 364942, 376961, 387925, 394141, 395194, 410564, 421547, 431743, 443273, 455032, 461233, 466804, 477304]
negative=[0, 0, 3, 21, 25, 25, 47, 56, 97, 155, 199, 245, 289, 327, 414, 462, 522, 693, 1001, 1134, 1683, 2552, 3535, 4811, 5921, 6679, 7148, 7647, 7980, 8923, 9973, 11238, 13016, 15116, 17520, 20292, 25447, 31576, 39318, 48593, 55723, 61825, 71580, 85484, 102965, 118965, 136624, 151802, 162008, 171778, 184179, 196100, 209402, 222728, 235615, 243778, 251297, 261105, 270888, 282555, 292487, 303006, 308343, 315447, 324105, 334481, 341332, 352410, 361883, 369530, 372002, 383886, 395075, 403882, 414303, 424732, 431425, 437225, 446323]
axis=plt.gca()
# One matplotlib date per day; drange() excludes the end date.
starttime=datetime.datetime(2020,1,19)
endtime=datetime.datetime(2020,4,7)
jianju=datetime.timedelta(days=1)
daes=mat.dates.drange(starttime,endtime,jianju)
axis=plt.gca()
dateFormat=mat.dates.DateFormatter('%Y-%m-%d')
axis.xaxis.set_major_formatter(dateFormat)
axis.plot(daes,test)
axis.plot(daes,negative)
axis.plot(daes,confirmed)
legend=plt.legend(["test","negative","confirmed"])
plt.xlabel('Date')
plt.ylabel('Patient_number')
Around February 18, 2020, the test and negative counts reach an inflection point and begin rapid growth.
Deal with and visualize SearchTrend.csv
Since the outbreak probably started on January 16, 2020, I take the information from January 16, 2020 onward and create a new document called Search.csv.
Count the different number of states
import pandas as pd
import numpy as np
import csv

datase = pd.read_csv(r"C:\Users\hp\Desktop\Search.csv")

# Print the search-trend series for each keyword.
# The original re-opened the file four times and stored every column in one
# repeatedly-overwritten variable named 'date'; a single pass with one
# clearly-named list per keyword replaces it.
cold, flu, pneumonia, coronavirus = [], [], [], []
with open(r"C:\Users\hp\Desktop\Search.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        cold.append(float(row['cold']))
        flu.append(float(row['flu']))
        pneumonia.append(float(row['pneumonia']))
        coronavirus.append(float(row['coronavirus']))
# Print in the original order: cold, flu, pneumonia, coronavirus.
print(cold)
print(flu)
print(pneumonia)
print(coronavirus)
import matplotlib as mat
import datetime
# Line chart: daily search-trend index for four keywords,
# 2020-01-17 .. 2020-04-05, hard-coded from Search.csv.
f=plt.figure(figsize=(15,8))
# NOTE(review): the value 5.0273 near the end of 'cold' is an order of
# magnitude above its neighbours (~0.5) — possibly a transcription typo
# for 0.50273; verify against Search.csv.
cold=[0.14345,0.15217,0.19217, 0.22462, 0.23808, 0.30308, 0.34689, 0.70888, 0.96569, 1.39513, 1.56922, 1.19978, 0.98978, 1.35841, 1.29296, 1.4925, 1.20932, 1.06169, 1.09323, 1.03724, 0.91715, 0.89597, 0.77052, 0.63816, 0.5338, 0.47862, 0.38444, 0.32153, 0.35944, 0.35171, 0.32426, 0.32108, 0.59207, 0.98306, 1.42804, 1.91594, 1.94203, 1.78603, 1.67058, 1.42177, 1.2255, 1.24941, 1.3085, 1.23605, 0.8816, 0.67934, 0.82579, 0.64916, 0.58625, 0.71743, 0.58752, 0.45435, 0.46934, 0.70143, 0.55771, 0.40744, 0.50816, 5.0273, 0.52925, 0.49134, 0.44625, 0.47171, 0.44453, 0.4738, 0.94142, 0.36689, 0.29562, 0.29508, 0.27471, 0.28517, 0.32717, 0.32462, 0.31962, 0.30762, 0.2289, 0.22862, 0.22453, 0.26708, 0.26281, 0.20717]
flu=[0.42089,0.39953,0.70343, 0.59789, 0.56661, 0.55625, 0.40226, 0.39744, 0.40126, 0.50253, 0.64298, 0.49207, 0.44953, 0.84061, 0.79288, 0.45298, 0.49516, 0.37907, 0.34444, 0.31644, 0.25808, 0.18081, 0.17208, 0.21499, 0.17072, 0.15281, 0.13999, 0.11808, 0.10026, 0.0899, 0.12045, 0.11963, 0.13172, 0.17026, 0.17808, 0.19472, 0.19326, 0.22708, 0.20181, 0.19281, 0.18535, 0.16544, 0.16108, 0.13754, 0.13345, 0.11917, 0.1079, 0.09745, 0.09163, 0.0769, 0.06717, 0.09263, 0.09108, 0.08945, 0.10008, 0.07808, 0.0689, 0.06754, 0.12036, 0.2749, 0.15999, 0.10754, 0.14426, 0.09381, 0.07108, 0.09354, 0.0739, 0.06099, 0.06699, 0.07836, 0.06272, 0.06754, 0.06354, 0.05954, 0.06018, 0.05527, 0.05118, 0.0549, 0.04918, 0.05763]
pne=[0.55216,0.61598,3.63716, 4.31987, 3.66416, 3.18035, 2.48156, 3.40926, 3.43989, 4.38777, 4.5434, 2.73, 2.17811, 2.1802, 1.44032, 1.37923, 1.33886, 0.99397, 0.85588, 0.79524, 0.80733, 0.53789, 0.46189, 0.50871, 0.40544, 0.37889, 0.37753, 0.2769, 0.25471, 0.35953, 0.45144, 1.00833, 0.96769, 1.15151, 1.17051, 1.27032, 1.15841, 1.17887, 1.02206, 0.87142, 0.75252, 0.6417, 0.52989, 0.44589, 0.4408, 0.39853, 0.35671, 0.31726, 0.28071, 0.22762, 0.20553, 0.23962, 0.28635, 0.30208, 0.26335, 0.22799, 0.19108, 0.16644, 0.20826, 0.24172, 0.56771, 1.10896, 0.88197, 0.28426, 0.20372, 0.24172, 0.19799, 0.18535, 0.16899, 0.17272, 0.1399, 0.12863, 0.1789, 0.18817, 0.1709, 0.16108, 0.15517, 0.1209, 0.11436, 0.14026]
cor=[0.26262,0.32662,20.6961, 35.33284, 29.74474, 100.0, 86.11541, 62.84847, 39.62772, 57.07446, 72.63073, 52.06261, 45.93889, 58.4187, 41.5073, 41.73647, 50.1994, 37.28806, 35.23094, 39.3959, 31.71404, 40.46614, 36.07691, 52.11234, 18.13918, 15.90997, 15.08472, 16.94803, 10.34468, 9.79906, 10.98521, 14.97945, 40.75986, 59.21885, 72.15665, 85.23907, 82.90014, 79.89478, 79.95441, 71.26268, 71.07204, 66.20619, 61.04053, 56.20422, 60.86853, 47.17367, 39.84053, 33.24218, 29.61647, 26.90492, 26.4902, 29.79228, 28.49514, 24.02346, 20.56511, 19.29851, 16.62577, 14.92164, 16.8914, 14.70828, 13.9044, 18.50553, 13.56359, 11.28111, 11.1433, 11.56656, 10.87394, 10.51795, 10.38105, 10.15869, 10.25105, 8.86464, 8.6911, 8.30075, 7.70922, 7.50622, 7.44741, 6.04172, 5.55301, 10.90885]
axis=plt.gca()
# One matplotlib date per day; drange() excludes the end date.
starttime=datetime.datetime(2020,1,17)
endtime=datetime.datetime(2020,4,6)
jianju=datetime.timedelta(days=1)
daes=mat.dates.drange(starttime,endtime,jianju)
axis=plt.gca()
dateFormat=mat.dates.DateFormatter('%Y-%m-%d')
axis.xaxis.set_major_formatter(dateFormat)
axis.plot(daes,cold)
axis.plot(daes,flu)
axis.plot(daes,pne)
axis.plot(daes,cor)
legend=plt.legend(["cold","flu","pneumonia","coronavirus"])
plt.xlabel('Date')
plt.ylabel('Search_trend')
We can see the search trend for each keyword. In particular, 'coronavirus' is by far the most prominent topic in people's searches.
Count the number of patients in different age and state is 'confirmed' in TimeAge.csv
import csv

# Collect the cumulative confirmed series for each age decade from
# TimeAge.csv. The original used nine separately-named lists plus nine
# near-identical if-branches (and three integer counters it never used);
# a dict keyed by age group replaces all of them.
timeage_groups = ['0s', '10s', '20s', '30s', '40s', '50s', '60s', '70s', '80s']
age_series = {age: [] for age in timeage_groups}
with open(r"C:\Users\hp\Desktop\TimeAge.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        if row["age"] in age_series:
            age_series[row["age"]].append(int(row["confirmed"]))
# Print each series and its length, youngest age group first.
for age in timeage_groups:
    print(age_series[age])
    print(len(age_series[age]))
Visualization
import matplotlib as mat
import datetime

# Line chart: cumulative confirmed cases per age decade,
# 2020-03-01 .. 2020-04-06 (one point per day; drange excludes the end).
# Removed from the original: unused confirmed/test/negative literals, and a
# stray axis.xaxis.set_major_formatter(dateFormat) call that ran before
# 'axis' and 'dateFormat' were defined in this cell (it only worked because
# of leftover notebook state from an earlier cell).
f=plt.figure(figsize=(15,8))
c1=[32, 34, 34, 38, 45, 52, 58, 66, 67, 75, 76, 77, 81, 83, 85, 86, 87, 91, 97, 99, 101, 103, 105, 105, 106, 108, 109, 111, 112, 112, 116, 119, 121, 122, 124, 126, 126]
c2=[169, 204, 233, 257, 292, 327, 360, 381, 393, 405, 412, 421, 424, 427, 432, 436, 438, 444, 452, 457, 460, 460, 468, 475, 488, 496, 501, 508, 513, 515, 519, 522, 528, 530, 535, 542, 544]
c3=[1235, 1417, 1575, 1727, 1877, 2038, 2133, 2190, 2213, 2238, 2261, 2274, 2287, 2301, 2313, 2330, 2342, 2358, 2365, 2380, 2396, 2417, 2438, 2473, 2508, 2532, 2567, 2602, 2630, 2656, 2682, 2704, 2734, 2761, 2789, 2804, 2819]
c4=[506, 578, 631, 659, 693, 727, 760, 779, 789, 804, 812, 823, 833, 842, 849, 856, 873, 886, 893, 900, 909, 917, 921, 943, 955, 960, 978, 993, 1002, 1012, 1027, 1043, 1052, 1066, 1083, 1086, 1092]
c5=[633, 713, 790, 847, 889, 941, 975, 1005, 1030, 1082, 1101, 1117, 1133, 1141, 1147, 1164, 1171, 1181, 1193, 1203, 1221, 1228, 1234, 1246, 1252, 1256, 1278, 1292, 1297, 1312, 1323, 1336, 1350, 1358, 1370, 1375, 1382]
c6=[834, 952, 1051, 1127, 1217, 1287, 1349, 1391, 1416, 1472, 1495, 1523, 1551, 1568, 1585, 1602, 1615, 1642, 1656, 1672, 1691, 1702, 1716, 1724, 1738, 1752, 1780, 1798, 1812, 1851, 1865, 1878, 1887, 1898, 1904, 1906, 1909]
c7=[530, 597, 646, 699, 763, 830, 878, 916, 929, 960, 972, 985, 999, 1012, 1024, 1033, 1059, 1080, 1099, 1118, 1132, 1139, 1146, 1154, 1162, 1170, 1201, 1210, 1218, 1235, 1245, 1258, 1266, 1282, 1289, 1294, 1304]
c8=[192, 224, 260, 288, 340, 384, 409, 438, 454, 483, 497, 506, 515, 525, 531, 539, 542, 562, 568, 589, 595, 599, 608, 611, 616, 630, 632, 635, 640, 651, 658, 663, 668, 678, 681, 686, 689]
c9=[81, 93, 108, 124, 168, 191, 212, 216, 222, 236, 243, 253, 263, 263, 270, 274, 286, 321, 329, 381, 392, 396, 401, 406, 416, 428, 432, 434, 437, 442, 452, 453, 456, 461, 462, 465, 466]
axis=plt.gca()
starttime=datetime.datetime(2020,3,1)
endtime=datetime.datetime(2020,4,7)
jianju=datetime.timedelta(days=1)
daes=mat.dates.drange(starttime,endtime,jianju)
dateFormat=mat.dates.DateFormatter('%Y-%m-%d')
axis.xaxis.set_major_formatter(dateFormat)
# One line per age decade, youngest first (matches the legend order).
for series in (c1, c2, c3, c4, c5, c6, c7, c8, c9):
    axis.plot(daes, series)
legend=plt.legend(["0s","10s","20s","30s","40s","50s","60s","70s","80s"])
plt.xlabel('Date')
plt.ylabel('Different_age_number')
Visualize sex and 'confirmed' in TimeGender.csv
import csv

# Collect the cumulative confirmed series for each sex from TimeGender.csv.
male_series = []
female_series = []
with open(r"C:\Users\hp\Desktop\TimeGender.csv", 'r') as csvfile:
    for record in csv.DictReader(csvfile):
        if record["sex"] == 'male':
            male_series.append(int(record["confirmed"]))
        elif record["sex"] == 'female':
            female_series.append(int(record["confirmed"]))
# Print each series and its length: male first, then female.
print(male_series)
print(len(male_series))
print(female_series)
print(len(female_series))
import matplotlib as mat
import datetime
# Line chart: cumulative confirmed cases per sex,
# 2020-03-01 .. 2020-04-06 (one point per day; drange excludes the end).
f=plt.figure(figsize=(15,8))
# Hard-coded daily cumulative series from TimeGender.csv: x1 = male, x2 = female.
x1=[1591, 1810, 1996, 2149, 2345, 2522, 2694, 2799, 2852, 2947, 2994, 3043, 3100, 3136, 3169, 3200, 3240, 3296, 3330, 3387, 3430, 3457, 3497, 3550, 3598, 3638, 3736, 3799, 3834, 3905, 3946, 3979, 4013, 4052, 4098, 4118, 4138]
x2=[2621, 3002, 3332, 3617, 3939, 4245, 4440, 4583, 4661, 4808, 4875, 4936, 4986, 5026, 5067, 5120, 5173, 5269, 5322, 5412, 5467, 5504, 5540, 5587, 5643, 5694, 5742, 5784, 5827, 5881, 5941, 5997, 6049, 6104, 6139, 6166, 6193]
axis=plt.gca()
starttime=datetime.datetime(2020,3,1)
endtime=datetime.datetime(2020,4,7)
jianju=datetime.timedelta(days=1)
daes=mat.dates.drange(starttime,endtime,jianju)
axis=plt.gca()
dateFormat=mat.dates.DateFormatter('%Y-%m-%d')
axis.xaxis.set_major_formatter(dateFormat)
axis.plot(daes,x1)
axis.plot(daes,x2)
plt.xlabel('Date')
plt.ylabel('Confirmed_number')
legend=plt.legend(["male","female"])
Visualize province and 'confirmed' in TimeProvince.csv
import csv

# Collect the cumulative confirmed series for each of the 17 provinces from
# TimeProvince.csv. The original used 17 separately-named lists and a
# 17-branch if chain; a dict keyed by province replaces them.
province_order = ['Seoul', 'Busan', 'Daegu', 'Incheon', 'Gwangju', 'Daejeon',
                  'Ulsan', 'Sejong', 'Gyeonggi-do', 'Gangwon-do',
                  'Chungcheongbuk-do', 'Chungcheongnam-do', 'Jeollabuk-do',
                  'Jeollanam-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do',
                  'Jeju-do']
province_series = {name: [] for name in province_order}
with open(r"C:\Users\hp\Desktop\TimeProvince.csv", 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        if row["province"] in province_series:
            province_series[row["province"]].append(int(row["confirmed"]))
# Print each series and its length, in the original (Seoul-first) order.
for name in province_order:
    print(province_series[name])
    print(len(province_series[name]))
import matplotlib as mat
import datetime

# Plot the cumulative confirmed-case curve of each of the 17 provinces on
# one shared, date-formatted x axis: one point per day from 2020-01-19 to
# 2020-04-06 (79 days, matching the 79 entries of every series).
f = plt.figure(figsize=(15, 8))
p1=[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 4, 4, 5, 5, 5, 5, 10, 11, 11, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 22, 27, 30, 31, 35, 41, 49, 56, 62, 77, 87, 91, 98, 99, 103, 105, 108, 120, 130, 141, 193, 212, 225, 238, 247, 253, 265, 270, 282, 299, 314, 324, 330, 334, 347, 360, 372, 390, 410, 426, 450, 474, 488, 506, 528, 552, 563, 567]
p2=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 11, 19, 34, 42, 58, 61, 65, 80, 83, 88, 90, 92, 92, 95, 96, 96, 96, 96, 98, 99, 100, 103, 106, 107, 107, 107, 107, 108, 108, 108, 109, 111, 112, 112, 113, 114, 117, 118, 119, 122, 122, 122, 122, 122, 122, 123]
p3=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 30, 110, 193, 310, 483, 543, 710, 1132, 1580, 2236, 2704, 3081, 3601, 4007, 4328, 4694, 5084, 5381, 5571, 5663, 5794, 5867, 5928, 5990, 6031, 6066, 6098, 6144, 6241, 6275, 6344, 6387, 6411, 6442, 6456, 6482, 6516, 6587, 6610, 6624, 6684, 6704, 6725, 6734, 6761, 6768, 6781, 6794]
p4=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 6, 6, 7, 7, 9, 9, 9, 9, 9, 9, 13, 25, 25, 27, 28, 30, 30, 31, 32, 32, 36, 40, 40, 40, 41, 42, 43, 46, 51, 58, 58, 64, 69, 73, 74, 77, 79, 80, 80]
p5=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 6, 8, 9, 9, 9, 9, 9, 9, 9, 9, 11, 13, 13, 13, 13, 13, 15, 15, 15, 15, 15, 15, 16, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 24, 25, 26, 26, 27, 27, 27]
p6=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 5, 9, 13, 13, 13, 14, 14, 15, 16, 18, 18, 18, 18, 18, 18, 20, 22, 22, 22, 22, 22, 22, 22, 22, 22, 24, 24, 24, 24, 30, 31, 31, 34, 34, 36, 36, 36, 36, 36, 37, 39, 39]
p7=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, 4, 11, 14, 17, 20, 20, 20, 23, 23, 23, 23, 24, 24, 24, 25, 25, 27, 27, 28, 28, 28, 30, 36, 36, 36, 36, 36, 36, 37, 37, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40]
p8=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 6, 8, 10, 15, 32, 38, 39, 40, 40, 41, 41, 41, 41, 41, 42, 42, 44, 44, 44, 44, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46]
p9=[0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 5, 6, 8, 8, 8, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14, 20, 24, 35, 42, 51, 62, 72, 82, 89, 92, 94, 101, 110, 120, 130, 141, 153, 163, 175, 178, 185, 200, 211, 231, 262, 277, 295, 309, 321, 337, 351, 366, 387, 401, 412, 433, 448, 463, 476, 499, 516, 539, 562, 572, 580, 590]
p10=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 7, 7, 15, 19, 20, 21, 23, 25, 26, 27, 28, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 32, 34, 36, 36, 38, 39, 41, 42, 45, 45, 47]
p11=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 3, 5, 5, 8, 9, 10, 11, 11, 11, 11, 12, 15, 20, 24, 25, 25, 27, 27, 27, 28, 31, 31, 31, 32, 33, 33, 34, 35, 37, 38, 38, 39, 41, 41, 41, 44, 44, 44, 44, 45, 45, 45, 45, 45]
p12=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 3, 12, 35, 55, 68, 78, 81, 82, 86, 90, 92, 98, 102, 104, 112, 114, 115, 115, 115, 115, 115, 118, 118, 119, 119, 120, 120, 120, 123, 124, 124, 126, 127, 127, 128, 131, 133, 134, 135, 135, 136, 137]
p13=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 13, 13, 14, 14, 15, 15, 16, 16, 16]
p14=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 3, 3, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 8, 8, 8, 8, 9, 9, 9, 12, 14, 15, 15, 15, 15, 15]
p15=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 25, 28, 145, 170, 193, 249, 317, 345, 409, 488, 556, 624, 685, 774, 861, 984, 1049, 1082, 1107, 1117, 1135, 1143, 1147, 1153, 1157, 1164, 1169, 1178, 1190, 1203, 1243, 1254, 1256, 1257, 1262, 1274, 1283, 1285, 1287, 1298, 1300, 1302, 1304, 1309, 1310, 1314, 1316, 1317]
p16=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 7, 7, 17, 20, 23, 34, 43, 49, 59, 63, 64, 64, 65, 74, 77, 82, 83, 83, 83, 84, 85, 85, 85, 85, 85, 86, 86, 86, 87, 87, 88, 89, 90, 90, 90, 91, 91, 94, 95, 96, 100, 106, 107, 108, 109, 111, 112]
p17=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 7, 8, 8, 9, 9, 9, 9, 9, 9, 12, 12, 12]
# All series and their legend names, in matching order.
series = [p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17]
names = ["Seoul", "Busan", "Daegu", "Incheon", "Gwangju", "Daejeon", "Ulsan",
         "Sejong", "Gyeonggi-do", "Gangwon-do", "Chungcheongbuk-do",
         "Chungcheongnam-do", "Jeollabuk-do", "Jeollanam-do",
         "Gyeongsangbuk-do", "Gyeongsangnam-do", "Jeju-do"]
starttime = datetime.datetime(2020, 1, 19)
endtime = datetime.datetime(2020, 4, 7)    # exclusive end -> last point 2020-04-06
jianju = datetime.timedelta(days=1)        # one-day step
daes = mat.dates.drange(starttime, endtime, jianju)
# A single gca() call is enough (the original fetched the axis twice).
axis = plt.gca()
axis.xaxis.set_major_formatter(mat.dates.DateFormatter('%Y-%m-%d'))
for curve in series:
    axis.plot(daes, curve)
legend = plt.legend(names)
plt.xlabel('Date')
plt.ylabel('Different_province_confirmed')
Check the data, and extract the columns that may be needed.
# Read the patient file and take a first look at its shape and missing values.
import pandas as pd
patientInfo = pd.read_csv('PatientInfo.csv')
print(patientInfo.shape)
#patientInfo.head(5)
print(patientInfo.info())
print(patientInfo.isnull().any(axis=0))
# Extract the columns needed; .copy() so the fills below act on an
# independent frame (no chained-assignment warnings).
sub_set = patientInfo[['sex','age','province','city','disease','infection_order','confirmed_date','released_date','deceased_date','state']].copy()
# Missing values: rows without 'age' or 'sex' cannot be used as samples.
# dropna replaces the original row-by-row drop loop (method1) and the
# '233' sentinel fill-then-drop trick (method2): the very same rows are
# removed, without the per-row loop.
sub_set = sub_set.dropna(subset=['age', 'sex'])
# Fill the remaining NAs: a missing 'disease' entry is treated as "no
# underlying disease".
sub_set['disease'] = sub_set['disease'].fillna('False')
print(sub_set['infection_order'].value_counts())  # check its mode
sub_set['infection_order'] = sub_set['infection_order'].fillna(1.0)  # fill NA with the mode
# Extract the feature columns again; .copy() for the same reason as above.
sub_set1 = sub_set[['sex','age','disease','infection_order','province','state']].copy()
sub_set1.head(10)
#print(sub_set1['state'].value_counts())
# Encode the object columns as numbers.
province = sub_set1.loc[:,'province'].drop_duplicates().values.tolist()
pro_dict = dict(zip(province, range(0, len(province))))  # province -> order of appearance
#pro_dict
TF_dict = {True: 1, 'False': 0}        # underlying-disease flag
sex_dict = {'male': 0, 'female': 1}
# Decade age buckets; '66s' is a data typo, treated as the 60s bucket.
age_dict = {'0s':0,'10s':1,'20s':2,'30s':3,'40s':4,'50s':5,'60s':6,'70s':7,'80s':8,'90s':9,'100s':10,'66s':6}
state_dict = {'isolated': 1, 'released': 1, 'deceased': 0}  # label: 1 = alive, 0 = deceased
sub_set1['sex'] = sub_set1['sex'].map(sex_dict)
sub_set1['age'] = sub_set1['age'].map(age_dict)
sub_set1['disease'] = sub_set1['disease'].map(TF_dict)
sub_set1['province'] = sub_set1['province'].map(pro_dict)
sub_set1['state'] = sub_set1['state'].map(state_dict)
Show the data processing result: 5 features with 1 label
sub_set1.head(5)
# Split features/label, train a depth-limited decision tree and report
# train/test scores plus the full classification metrics on the test set.
from sklearn.model_selection import train_test_split
y = sub_set1['state']  # training label: 1 = alive (isolated/released), 0 = deceased
del sub_set1['state']
X = sub_set1  # training features (sex, age, disease, infection_order, province)
df_trainx,df_testx, df_trainy, df_testy = train_test_split(X, y, test_size=0.3, random_state=0)
# Label distribution of both splits, as a sanity check.
print(df_trainy.value_counts()/len(df_trainy))
print(df_testy.value_counts()/len(df_testy))
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# max_depth=5 keeps the tree small enough to visualize and limits overfitting.
model = DecisionTreeClassifier(criterion='entropy',splitter = 'best',max_depth=5)
#model = KNeighborsClassifier(n_neighbors = 1)
model.fit(df_trainx, df_trainy)
quility_test_pred = model.predict(df_testx)
print("train score:", model.score(df_trainx, df_trainy))
print("test score:", model.score(df_testx, df_testy))
from sklearn import metrics
print(metrics.classification_report(df_testy, quility_test_pred))
print(metrics.confusion_matrix(df_testy, quility_test_pred))
print(metrics.accuracy_score(df_testy, quility_test_pred))
# Visualize the fitted decision tree as a PNG via graphviz/pydotplus.
# NOTE: the original cell imported StringIO from `sklearn.externals.six`
# and pydot for a commented-out variant.  `sklearn.externals.six` was
# removed from modern scikit-learn (the import itself crashes), and
# neither name is used by the live code below, so the dead imports and
# the dead commented code were dropped.
from IPython.display import Image
from sklearn import tree
import pydotplus
dot_data = tree.export_graphviz(model, out_file=None,             # `model` is the fitted classifier
                                feature_names=df_trainx.columns,  # names of the 5 features
                                class_names=['sorry', 'alive'],   # class 0 = deceased, 1 = alive
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('example.png')  # save the image
Image(graph.create_png())
# Extract the columns needed again and deal with missing values.
sub_set2 = sub_set[['sex','age','disease','infection_order','province','confirmed_date','released_date']].copy()
# Only released patients have a released_date; drop all the others.
# (Replaces the original '466' sentinel fill-then-drop trick -- the same
# rows are removed.)
sub_set2 = sub_set2.dropna(subset=['released_date'])
print(sub_set2.info())
# Get the recovery time.
import numpy as np
from pandas import to_datetime
from datetime import datetime
sub_set2.head(20)
start_date = to_datetime(sub_set2.confirmed_date, format="%Y-%m-%d")
end_date = to_datetime(sub_set2.released_date, format="%Y-%m-%d")
#print(start_date)
#print(end_date)
# Recovery time = released - confirmed, as a float number of days.
recover_time = end_date - start_date
recover_time = recover_time.map(lambda x: x / np.timedelta64(1, 'D'))
sub_set2['recover_time'] = recover_time
sub_set2 = sub_set2.drop(columns=['confirmed_date', 'released_date'])
sub_set2.head(5)
# Map the object columns to float/int64, using the same encoding scheme
# that was used for sub_set1.
province = sub_set2.loc[:,'province'].drop_duplicates().values.tolist()
pro_dict = dict(zip(province,range(0,len(province))))  # province -> order of appearance
pro_dict
TF_dict = {True:1,'False':0}        # underlying-disease flag
sex_dict = {'male':0,'female':1}
# Decade age buckets; '66s' is a data typo, treated as the 60s bucket.
age_dict = {'0s':0,'10s':1,'20s':2,'30s':3,'40s':4,'50s':5,'60s':6,'70s':7,'80s':8,'90s':9,'100s':10,'66s':6}
state_dict = {'isolated':1,'released':1,'deceased':0}  # unused here: sub_set2 has no 'state' column
sub_set2['sex'] = sub_set2['sex'].map(sex_dict)
sub_set2['age'] = sub_set2['age'].map(age_dict)
sub_set2['disease'] = sub_set2['disease'].map(TF_dict)
sub_set2['province'] = sub_set2['province'].map(pro_dict)
Show the data-processing result: 5 features and 1 label (recover_time).
sub_set2.head(5)
from sklearn.model_selection import train_test_split
y = sub_set2['recover_time'] #训练结果
del sub_set2['recover_time']
X = sub_set2 #训练数据
df_trainx,df_testx, df_trainy, df_testy = train_test_split(X, y, test_size=0.3, random_state=0)
print(df_trainy)
print(df_testy)
from sklearn.linear_model import Lasso,LassoCV,LassoLarsCV
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.linear_model import LinearRegression
model = LassoCV()
#model = LinearRegression()
model.fit(X, y) # linear regression model
print('matrix:\n',model.coef_)
print('model:\n',model)
predicted = model.predict(df_testx)
from sklearn.metrics import r2_score
r2_score(df_testy, predicted)
Interpretation for classification part
I also tried Naive Bayes and KNN classifier,found that decision tree performs best.
In all three classifiers, the deceased cases are undervalued. At first I assumed this was because I counted the isolated patients as alive cases, although some of them may yet decease; however, when I tried removing the isolated cases, the model only performed worse.
Another source of misclassification may come from the encoding of the province feature: I simply encoded it in order of appearance. I hope to optimize this part after obtaining the coordinates of each city.
According to the precision and recall, the deceased probability is undervalued, but the result still has value since it has a good overall score.
Interpretation for regression part
# Bring in the city-level statistics and merge them onto the patient table.
Region = pd.read_csv('Region.csv')
del Region['code']
del Region['latitude']
#sub_set
merge_result = pd.merge(sub_set, Region, on='city')
#merge_result
# Found a contradiction between the two province columns after the merge
# (province_x from the patients, province_y from Region).
sub_set3 = merge_result.copy(deep=True)
# Drop the columns that are not used as numeric features.
dllist = ['province_y','deceased_date','released_date','confirmed_date','city','state','province_x','longitude']
for item in dllist:
    del sub_set3[item]
sub_set3
# Map the remaining object columns with the same dictionaries as before.
# TF_dict = {True:1,'False':0}
# sex_dict = {'male':0,'female':1}
# age_dict = {'0s':0,'10s':1,'20s':2,'30s':3,'40s':4,'50s':5,'60s':6,'70s':7,'80s':8,'90s':9,'100s':10,'66s':6}
# state_dict = {'isolated':1,'released':1,'deceased':0}
sub_set3['sex'] = sub_set3['sex'].map(sex_dict)
sub_set3['age'] = sub_set3['age'].map(age_dict)
sub_set3['disease'] = sub_set3['disease'].map(TF_dict)
#sub_set3['state'] = sub_set3['state'].map(state_dict)
# stdlist = ['elementary_school_count','kindergarten_count','university_count',
#            'academy_ratio','elderly_population_ratio','elderly_alone_ratio','nursing_home_count']
# for item in stdlist:
#     sub_set3 = (sub_set3-sub_set3.mean())/(sub_set3.std())
# Min-max normalise every column to [0, 1] before PCA so no single
# feature dominates the components.
sub_set3 = (sub_set3-sub_set3.min())/(sub_set3.max()-sub_set3.min())
sub_set3.head(5)
#finish
# Reduce to 3 principal components and report the explained variance.
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
newsub_set3 = pca.fit_transform(sub_set3)
print(pca.explained_variance_ratio_)
newsub_set3
In this task,I mainly analyze region.csv and take some characteristics to do the clustering process so that I can set different cities as different risk levels.
Deal with Region.csv
# Deal with Region.csv: cluster the cities into risk levels with K-means.
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# City-level statistics (school counts, nursing homes, ...).
loan_data = pd.read_csv(r"C:\Users\hp\Desktop\Region.csv")
Firstly, I count the number of patients in every city from PatientInfo.csv and store it in a new column called 'counts'. Then I use 'counts' and 'nursing_home_count' to do K-means clustering to find something about risk levels.
Then I need to find the most appropriate 'k' value by using Elbow Method
import matplotlib.pyplot as plt

# Elbow method: run K-means for k = 1..18 and record, for each k, the total
# Manhattan distance of every point to its cluster centre.
def manhattan_distance(x, y):
    """L1 distance: sum of absolute coordinate differences."""
    return np.sum(abs(x - y))
# (The original re-defined this helper inside every loop iteration; it is
# loop-invariant, so it is hoisted here.)

# Convert the two features to a numpy array.
X = np.array(loan_data[['counts', 'nursing_home_count']])
distance = []
k = []
for n_clusters in range(1, 19):
    cls = KMeans(n_clusters).fit(X)
    distance_sum = 0
    for i in range(n_clusters):
        members = X[cls.labels_ == i, :]  # points assigned to cluster i
        for v in members:
            distance_sum += manhattan_distance(np.array(v), cls.cluster_centers_[i])
    distance.append(distance_sum)
    k.append(n_clusters)
plt.scatter(k, distance)
plt.plot(k, distance)
plt.xlabel("k")
plt.ylabel("distance")
plt.show()
From the picture above,I choose 5 as 'k' value
# Cluster the cities (k = 5, chosen from the elbow plot above) on
# patient counts vs nursing-home counts and draw one colour per cluster.
loan = np.array(loan_data[['counts', 'nursing_home_count']])
clf = KMeans(n_clusters=5)
clf = clf.fit(loan)
# Append the cluster label to the original table.
loan_data['label'] = clf.labels_
plt.figure(figsize=(10, 8))
plt.rc('font', family='STXihei', size=10)
# One filter+scatter per cluster label (replaces five duplicated stanzas).
colors = ['#99CC01', '#FE0000', '#0000FE', 'darkorange', 'hotpink']
for label, color in enumerate(colors):
    cluster = loan_data.loc[loan_data["label"] == label]
    plt.scatter(cluster['counts'], cluster['nursing_home_count'], 50,
                color=color, marker='+', linewidth=2, alpha=0.8)
plt.xlabel('Patient_counts')
plt.ylabel('nursing_home_count')
plt.xlim(0, 130)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
plt.show()
From the clustering results above, I can assign different clusters different risk levels with respect to patient_counts and nursing_home_count. However, there are some cities with 0 patient_counts, so I consider them the lowest risk level temporarily.
from sklearn import metrics
# Silhouette coefficient (euclidean) as a quality measure of the clustering.
print("Silhouette Coefficient:", metrics.silhouette_score(loan, clf.labels_, metric='euclidean'))
Model assessment: the silhouette coefficient shows this clustering is suitable.
Next, since schools are the most vulnerable to clusters of infections, I sum elementary_school_count, kindergarten_count and university_count into a new column 'all_school_counts' for each city, and then consider all_school_counts together with patient_counts.
import matplotlib.pyplot as plt

# Elbow method again, now on all_school_counts vs patient counts.
def manhattan_distance(x, y):
    """L1 distance: sum of absolute coordinate differences."""
    return np.sum(abs(x - y))
# (Hoisted out of the loop: the original re-defined it every iteration.)

# Convert the two features to a numpy array.
X = np.array(loan_data[['all_school_counts', 'counts']])
distance = []
k = []
for n_clusters in range(1, 19):
    cls = KMeans(n_clusters).fit(X)
    distance_sum = 0
    for i in range(n_clusters):
        members = X[cls.labels_ == i, :]  # points assigned to cluster i
        for v in members:
            distance_sum += manhattan_distance(np.array(v), cls.cluster_centers_[i])
    distance.append(distance_sum)
    k.append(n_clusters)
plt.scatter(k, distance)
plt.plot(k, distance)
plt.xlabel("k")
plt.ylabel("distance")
plt.show()
I choose 5 as 'k' value
# Cluster the cities (k = 5) on all_school_counts vs patient counts and
# draw one colour per cluster.
loan = np.array(loan_data[['all_school_counts', 'counts']])
clf = KMeans(n_clusters=5)
clf = clf.fit(loan)
# Append the cluster label to the original table.
loan_data['label'] = clf.labels_
plt.figure(figsize=(10, 8))
plt.rc('font', family='STXihei', size=10)
# One filter+scatter per cluster label (replaces five duplicated stanzas).
colors = ['#99CC01', '#FE0000', '#0000FE', 'darkorange', 'hotpink']
for label, color in enumerate(colors):
    cluster = loan_data.loc[loan_data["label"] == label]
    plt.scatter(cluster['all_school_counts'], cluster['counts'], 50,
                color=color, marker='+', linewidth=2, alpha=0.8)
plt.xlabel('all_school_counts')
plt.ylabel('patient_counts')
plt.xlim(0, 1500)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
plt.show()
According to the clustering results, I set different risk levels and combine them with the previous clustering result in a comprehensive assessment that assigns cities different segregation levels.
from sklearn import metrics
# Silhouette coefficient (euclidean) as a quality measure of the clustering.
print("Silhouette Coefficient:", metrics.silhouette_score(loan, clf.labels_, metric='euclidean'))
Model assessment: the silhouette coefficient is suitable. These data can be an important reference for the government.
This depends on the social network, the public consensus about the emergency events, and more importantly the self-protection awareness of the people, and the government policy, etc. Try to integrate some of these effects into your model, and make a prediction of the trend of the infection numbers. You can use any model you like. The information from the data files ‘Weather.csv’, ‘SearchTrend.csv’, etc.could be used. Perhaps network model and time series analysis can be good for this task. The dynamical model based on differential equations are also welcome.
Since Covid-19 causes typical infections, I apply the SEIR model — a classic model for infectious diseases — to analyse the data. The interpretation is as below:
S: Susceptible Population, here means those unaffected Korea people
E:Population that have been infected but not shown symptom, having the ability to infect others,may convert to patients or simply automatically recovered.
I: Infected population; assumed to leave the infected state after several days, turning into the R population.
R:Population that won't infect or be infected , here means those recovered and dead.
a: an E/I infects a people per day, shown as a = kS to introduce the effect of S population.
b: Percentage of E turning into I
Dailyadd = a(I+E)
dS= -a(I+E)
dE = a(I+E)- Dailyadd(8 days ago*)
dI = b×Dailyadd(8 days ago) - b×Dailyadd(18 days ago*)
dR = (1-b)Dailyadd(8 days ago)+ b×Dailyadd(18 days ago)
* : According to the information on the internet,the Covid-19 have average 8 days of incubation. According to the given data, the patient takes 18 days averagely to recover(or die).
import pandas as pd
from itertools import accumulate

# Grade public awareness from the accumulated search volume for
# "coronavirus" in SearchTrend.csv.
Case = pd.read_csv('SearchTrend.csv')
search = Case['coronavirus'].tolist()
# Running total of searches, with a leading 0 so totalsearch[i] is the sum
# of the first i days (same values as the original manual loop).
totalsearch = [0] + list(accumulate(search))
# Keep the last 61 days -- the modelled window.
searchpoint = totalsearch[-61:]
minpoint = searchpoint[0]
maxpoint = searchpoint[60]
# Map awareness linearly onto an infection-reduction coefficient:
# assume that if people are fully aware, 99% of infections can be avoided.
coefficient = [1 - ((point - minpoint) / (maxpoint - minpoint) * 0.99)
               for point in searchpoint]
print(searchpoint)
# The first patient was found on Feb 5th, so the data starts from Feb 5th.
The part above was added after the model had been built; it introduces people's awareness of the situation (shown as `coefficient`).
The awareness is graded by the sum-up search amount calculated from 'SearchTrend.csv'
N = 52230000 #Overall Korea population
# Initialize the SEIR series: one entry per simulated day.
S = [N]         # susceptible
E = [1]         # exposed: infected, no symptoms yet, can infect others
I = [1]         # infectious / confirmed patients
R = [0]         # removed: recovered or dead
Dayadd = [0]    # new infections per day
totalcon = [0]  # cumulative infections (the model's "confirmed")
I divided the epidemic into three stages, according to the incubation time and the recovery time:
# SEIR parameters.
a = 0.7    # infection rate: one E/I infects `a` people per day
k = a/N    # a = k*S form, to introduce the effect of the S population
b = 0.6    # share of newly-exposed that later turn into I (the rest recover)
Tr = 1     # NOTE(review): appears unused in the simulation below -- confirm
#Stage 1: the first 8 days (one incubation period) -- nobody leaves E yet.
for index in range(8):
    add = k*S[index]*(I[index]+E[index])  # today's new infections
    S.append(S[index]-add)
    E.append(E[index]+add)
    I.append(I[index]-0)  # the "-0" has no effect: recoveries are not counted yet
    R.append(0)
    Dayadd.append(add)
    totalcon.append(sum(Dayadd))
#Stage 2: days 8..17 -- infections from 8 days ago start to convert
# (share b becomes I, share 1-b recovers straight to R).
k_ad = k*0.5  # government policy cuts the infection rate by half
for index in range(8,18):
    #i/10
    add = coefficient[index]*k_ad*S[index]*(I[index]+E[index])
    S.append(S[index]-add)
    E.append(E[index]+add-Dayadd[index-8])
    I.append(I[index]+b*Dayadd[index-8]-0)  # nobody leaves I before day 18
    R.append(R[index]+(1-b)*Dayadd[index-8])
    Dayadd.append(add)
    totalcon.append(sum(Dayadd))
print(len(I))
Before stepping into the third stage, check the first two stages:
import matplotlib.pyplot as plt
# Sanity check: plot the patient curve of the first two stages (19 days).
f=plt.figure(facecolor = 'white')
plt.plot(range(19),I)
plt.show()
#Stage 3: days 18..59 -- infections from 18 days ago now also leave I
# (recover or die), so I both gains and loses members each day.
for index in range(18,60):
    add = coefficient[index]*k_ad*S[index]*(I[index]+E[index])
    S.append(S[index]-add)
    E.append(E[index]+add-Dayadd[index-8])
    I.append(I[index]+b*Dayadd[index-8]-b*Dayadd[index-18])
    R.append(R[index]+(1-b)*Dayadd[index-8]+b*Dayadd[index-18])
    Dayadd.append(add)
    totalcon.append(sum(Dayadd))
# Here introduce the real data (a direct copy from my partner's work):
# cumulative confirmed cases in Korea, one entry per day.
confirmed=[1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 6, 11, 12, 15, 15, 16, 18, 23, 24, 24, 27, 27, 28, 28, 28, 28, 28, 29, 30, 31, 51, 104, 204, 433, 602, 833, 977, 1261, 1766, 2337, 3150, 3736, 4212, 4812, 5328, 5766, 6284, 6767, 7134, 7382, 7513, 7755, 7869, 7979, 8086, 8126, 8236, 8320, 8413, 8565, 8652, 8799, 8897, 8961, 9037, 9137, 9241, 9332, 9478, 9583, 9661, 9786, 9887, 9976, 10062, 10156, 10237, 10284, 10331]
Plot the assumed situation and real situation.
# Compare the model with reality over the last 61 days.
confirmedFC = confirmed[-61:]
f=plt.figure(figsize=(15,8),facecolor = 'white')
plt.plot(range(61),I)           # model: current patients
plt.plot(range(61),totalcon)    # model: cumulative confirmed
#plt.plot(range(61),R)
plt.plot(range(61),confirmedFC) # reality: cumulative confirmed
legend=plt.legend(["Patients",'assumed confirmed','confirmed'])
plt.show()
#index =26
#print(k*S[index]*(I[index]+E[index]))
According to the picture, the situation under the idealized model is shown in yellow and blue,while the true situation is in green.
The model beautifully fits the situation in the first 30 days, and it seems that the data reaches its peak faster in the model than in reality. I suppose this difference is caused by suspected cases (like the early phase in Wuhan) that have not yet been digested by the medical system.
Since the real line increases linearly from the 30th to the 60th day, I assume the examining capacity of the medical system is a constant.
Directly use least square to find the slope (digesting rate).
# The 30th..59th days of the real curve look linear -- plot that part alone.
apart = confirmedFC[30:60]
f=plt.figure(facecolor = 'white')
plt.plot(range(30),apart)
plt.show()
def liner_fitting(data_x, data_y):
    """Ordinary least-squares fit of y = k*x + b.

    data_x, data_y -- equal-length sequences of numbers.
    Returns [k, b]: the fitted slope and intercept.

    Note: raises ZeroDivisionError when all x values are equal (the
    denominator of the closed-form slope is then zero), just like the
    original implementation.
    """
    size = len(data_x)
    # Accumulate the four sums of the closed-form normal equations in one
    # pass (replaces the original C-style while loop over indices).
    sum_xy = sum(x * y for x, y in zip(data_x, data_y))
    sum_x = sum(data_x)
    sum_y = sum(data_y)
    sum_sqare_x = sum(x * x for x in data_x)
    # Slope: k = (n*Sxy - Sx*Sy) / (n*Sxx - Sx^2)
    return_k = (size * sum_xy - sum_x * sum_y) / (size * sum_sqare_x - sum_x * sum_x)
    # Intercept: b = mean(y) - mean(x) * k
    return_b = sum_y / size - (sum_x / size) * return_k
    return [return_k, return_b]
# Fit the linear part of the real curve and estimate when reality catches
# up with the model's final cumulative total.
solution = liner_fitting(range(30),apart)
print(solution)
print((totalcon[60]-solution[1])/solution[0])#about 56 days from the 30th day
print(Dayadd[60])  # model's new cases on the last simulated day (already small)
Since the number of new cases after the 60th day is small and continually decreasing, I regard it as unchanged.
From the calculation above, it takes 56 days for the green line (the real situation) to surpass the yellow line. That means that, after about 26 days from April 4th, the confirmed cases in Korea may stop increasing if they keep up their epidemic-prevention strategies.
Analyze the location of the cases and patients, draw some conclusions Although different cities have different situations, citizens are not simply divided by the boundary of cities. According to the location info (latitude and longitude) of the files 'Case.csv' and 'PatientRoute.csv',can you find something hidden conclusions?For example, patients are grouped in some specific area,or someone may cause the infection of two groups of people,or cut the route of two cities that may ease the infection.
Our work mainly focus on the visualization of those data related to geographic position.
Firstly,Find several large clusters by latitude and longitude of PatientRoute.csv
# Cluster the patient-route coordinates to find geographic hot spots.
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# One row per visited location (with latitude/longitude) per patient.
loan_data = pd.read_csv(r"C:\Users\hp\Desktop\PatientRoute.csv")
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method on the route coordinates: run K-means for k = 1..18 and
# record the total Manhattan distance of the points to their centres.
def manhattan_distance(x, y):
    """L1 distance: sum of absolute coordinate differences."""
    return np.sum(abs(x - y))
# (Hoisted out of the loop: the original re-defined it every iteration.)

# Convert the coordinate columns to a numpy array.
X = np.array(loan_data[['latitude', 'longitude']])
distance = []
k = []
for n_clusters in range(1, 19):
    cls = KMeans(n_clusters).fit(X)
    distance_sum = 0
    for i in range(n_clusters):
        members = X[cls.labels_ == i, :]  # points assigned to cluster i
        for v in members:
            distance_sum += manhattan_distance(np.array(v), cls.cluster_centers_[i])
    distance.append(distance_sum)
    k.append(n_clusters)
plt.scatter(k, distance)
plt.plot(k, distance)
plt.xlabel("k")
plt.ylabel("distance")
plt.show()
I choose 7 as 'k' value.
# Final clustering of the route coordinates (k = 7, from the elbow plot)
# and a scatter plot with one colour per cluster.
loan = np.array(loan_data[['latitude', 'longitude']])
clf = KMeans(n_clusters=7)
clf = clf.fit(loan)
# Append the cluster labels to the original table.
loan_data['label'] = clf.labels_
plt.figure(figsize=(10, 8))
plt.rc('font', family='STXihei', size=10)
# One filter+scatter per cluster label (replaces seven duplicated stanzas).
colors = ['#99CC01', '#FE0000', '#0000FE', 'darkorange', 'hotpink',
          'slategray', 'yellow']
for label, color in enumerate(colors):
    cluster = loan_data.loc[loan_data["label"] == label]
    plt.scatter(cluster['latitude'], cluster['longitude'], 50,
                color=color, marker='+', linewidth=2, alpha=0.8)
plt.xlabel('latitude')
plt.ylabel('longitude')  # fixed axis-label typo ('longtitude')
plt.xlim(33, 39)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
plt.show()
The clustering results show different distribution of patient_route which can help government formulate guidelines for controlling the epidemic by these big clusters.
from sklearn import metrics
# Silhouette coefficient (euclidean) as a quality measure of the clustering.
print("Silhouette Coefficient:", metrics.silhouette_score(loan, clf.labels_, metric='euclidean'))
Model assessment: the silhouette coefficient is good, so these clusters can be an important reference for quarantining the corresponding areas before the situation gets worse.
Then we deal with case.csv and PatientRoute.csv at the same time.
Also deleted those rows with empty value
import pandas as pd
# Load the case file and take a first look.
Case = pd.read_csv('Case.csv')
print(Case.shape)
print(Case.info())
Case.head(5)
# Drop rows whose latitude is the '-' placeholder (missing coordinates).
na_loc = Case[(Case.latitude == '-')].index.tolist()
Case = Case.drop(na_loc)
#print(Case['confirmed'].value_counts())
PatientRoute = pd.read_csv('PatientRoute.csv')
#print(PatientRoute.shape)
#print(PatientRoute.info())
PatientRoute.head(5)
To get the coupled longitude and latitude data, and to count each couples' repeated time.
from collections import Counter

# Pair longitude/latitude into a "lon,lat" string key (4 decimal places)
# and count how many route records fall on each exact location.
PatientRoute['Lo_La'] = PatientRoute[['longitude', 'latitude']].apply(
    lambda x: "{:.4f},{:.4f}".format(*x), axis=1)
# Counter replaces the original set + manual-increment dict; LOLA and
# `time` below stay parallel (same key/value pairing).
Lo_La_t = Counter(PatientRoute['Lo_La'].tolist())
# Per-case columns used for the big magenta markers on the map.
case_lo = Case['longitude'].tolist()
case_la = Case['latitude'].tolist()
case_t = Case['confirmed'].tolist()
case_c = Case['infection_case'].tolist()
#Lo_La_t
LOLA = list(Lo_La_t)              # distinct location keys
time = list(Lo_La_t.values())     # visit count per key, same order
Lo_La = [item.split(',') for item in LOLA]  # keys back to [lon, lat] strings
#print(Lo_La)
#longitude and latitude are well placed here.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
# Korea bounding box: lower-left / upper-right corner lon-lat.
plt.rcParams['figure.figsize'] = (22.0, 18.0)
map = Basemap(llcrnrlon =126, llcrnrlat = 34, urcrnrlon = 130, urcrnrlat=38.5)
map.readshapefile('./mapsituation/gis_osm_roads_free_1','state',color="grey") #this allows the roads in Korea to be painted
map.drawcoastlines()
map.drawcountries()
# Magenta: clusters from Case.csv with more than 50 confirmed cases;
# marker size and annotation size grow with the case count.
for index in range(len(case_lo)):
    if case_t[index]>50:
        plt.scatter(float(case_lo[index]),float(case_la[index]),s = case_t[index]*3,color = 'm')
        plt.annotate(case_c[index],xy=(float(case_lo[index]),float(case_la[index])),fontsize= 20+case_t[index]/100)
# Blue: route locations visited ten times or more.
for index in range(len(Lo_La)):
    if time[index]>9:
        plt.scatter(float(Lo_La[index][0]),float(Lo_La[index][1]),s = time[index]*3,color = 'blue')
# plt.show()
plt.show()
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
# Same map as above but without the road shapefile layer.
plt.rcParams['figure.figsize'] = (22.0, 18.0)
map = Basemap(llcrnrlon =126, llcrnrlat = 34, urcrnrlon = 130, urcrnrlat=38.5)
#map.readshapefile('./mapsituation/gis_osm_roads_free_1','state',color="grey") #this allows the roads in Korea to be painted
map.drawcoastlines()
map.drawcountries()
# Magenta: clusters from Case.csv with more than 50 confirmed cases.
for index in range(len(case_lo)):
    if case_t[index]>50:
        plt.scatter(float(case_lo[index]),float(case_la[index]),s = case_t[index]*3,color = 'm')
        plt.annotate(case_c[index],xy=(float(case_lo[index]),float(case_la[index])),fontsize= 20+case_t[index]/100)
# Blue: route locations visited ten times or more.
for index in range(len(Lo_La)):
    if time[index]>9:
        plt.scatter(float(Lo_La[index][0]),float(Lo_La[index][1]),s = time[index]*3,color = 'blue')
# plt.show()
plt.show()